#######################################################################
#######################################################################
######## JACK BLUMENAU, MATTHEW KIM, & DAVID ROMNEY  RCODE ############
#######################################################################
#######################################################################

#####################
#### Load packages
#####################
library(countrycode)
library(ROCR)
library(pROC)
library(stargazer)
library(foreign)
library(Hmisc)
library(BMA)
library(Matching)
library(rms)
library(mvtnorm)
library(boot)
library(dplyr)
library(Amelia)
library(MatchingFrontier)
library(ggplot2)
library(Zelig)
library(countrycode)
library(data.table)
require(mvtnorm)
library(rms)
require(boot)
library(data.table)
library(gridExtra)
library(zoo)

########################### 
#### Set working directory
########################### 
##setwd("")

##########################################################
#### Create function for clustered robust standard errors
##########################################################
## vcovCluster.r
## function to compute var-cov matrix using clustered robust standard errors
## inputs:
## model = model object from call to lm or glm
## cluster = vector with cluster ID indicators
## output:
## cluster robust var-cov matrix
## to call this for a model directly use:
## coeftest(model,vcov = vcovCluster(model, cluster))
## formula is similar to Stata's , cluster command

vcovCluster <- function(
    model,
    cluster
    )
{
    require(sandwich)
    require(lmtest)
    if(nrow(model.matrix(model)) != length(cluster)){
        stop("Check your data: Cluster variable has different N than model")
    }
    M <- length(unique(cluster))
    N <- length(cluster)
    K <- model$rank
    if(M < 50){
        warning("Fewer than 50 clusters, variances may be unreliable (could try block bootstrap instead).")
    }
    dfc <- (M/(M - 1)) * ((N - 1)/(N - K))
    uj  <- apply(estfun(model), 2, function(x) tapply(x, cluster, sum));
    rcse.cov <- dfc * sandwich(model, meat = crossprod(uj)/N)
    return(rcse.cov)
}

##################################################################
#### Create function for standardizing component parts of indices
##################################################################
zero_to_one <- function(x, na.rm = T) {
  ((x - min(x, na.rm = T)) / (max(x, na.rm = T) - min(x,na.rm = T))) + 0.0001
}

##################### 
### Merge Datasets
#####################
##Load Warren's data
reg_data <- read.dta("Warren_IO_reg_data.dta")

##Load CINC data
cinc <- read.csv("cinc.csv")
cinc <- tbl_df(cinc)
reg_data <- tbl_df(reg_data)
cinc <- select(cinc, -stateabb, -tpop, -version)
cinc$cowcode <- cinc$ccode
cinc$ccode <- NULL
cinc$milex[cinc$milex == -9] <- NA
cinc$milper[cinc$milper == -9] <- NA

## Create id to see which have dropped
reg_data$id <- 1:nrow(reg_data)

## Merge
merged <- merge(reg_data, cinc, all = TRUE)
merged <- merged[!(is.na(merged$id)), ]

############################
### Create M-score
############################
## Standardize component hard power terms
merged$milper01<-zero_to_one(merged$milper)
merged$milex01<-zero_to_one(merged$milex)
merged$energy01<-zero_to_one(merged$energy)
merged$irst01<-zero_to_one(merged$irst)

## Sum and divide by logged population
merged$M<-((merged$milper01+merged$milex01+merged$energy01+merged$irst01)/merged$lpopl)

## Standardize component soft power terms
merged$newsli01<-zero_to_one(merged$newsli)
merged$radioli01<-zero_to_one(merged$radioli)
merged$tvli01<-zero_to_one(merged$tvli)

## Sum and divide by logged population
merged$mdi_std<-((merged$newsli01+merged$radioli01+merged$tvli01)/merged$lpopl)

#################################
### Create squared term for MDI
#################################
merged$mdi_std2<-merged$mdi_std^2

############################################
### Create 5 year rolling onset region mean
############################################
## Add region variable
merged$region<-countrycode(merged$cow,"cown","region")

# Add missing values for region
merged$region[merged$country=="Germany"]<-"Western Europe"
merged$region[merged$country=="Czechoslovakia"]<-"Eastern Europe"
merged$region[merged$country=="Yugoslavia"]<-"Eastern Europe"
merged$region[merged$country=="Yemen AR"]<-"Western Asia"
merged$region[merged$country=="Yemen PR"]<-"Western Asia"
merged$region[merged$country=="Taiwan"]<-"Eastern Asia"

## Convert to data.table for aggregation
merged <-data.table(merged)

## Count onset by region, by year
yearly.region.onset<-merged[,
                            sum(onset),by=list(region,year)
                            ]

## Sort results
setkey(yearly.region.onset,region,year)

x<-yearly.region.onset[,
                       rollapply(V1,6,function(x)sum(x[-length(x)]),
                                 partial=T,align="right"),
                       by=list(region)
                       ]

## Find number of countries, per region, per year
n.countries<-merged[,
                    length(cowcode),by=list(region,year)
                    ]
setkey(n.countries,region,year)

## Bind together
yearly.region.onset<-cbind(yearly.region.onset,x$V1, n.countries$V1)
yearly.region.onset$V3<-n.countries$V1
names(yearly.region.onset)[3:5]<-c("yeartotal","rollingtotal","ncountries")

## Calculate mean rolling average
yearly.region.onset$regiononsetmean<-yearly.region.onset$rollingtotal/yearly.region.onset$ncountries

## Merge back into merged.df
merged<-merge(merged, yearly.region.onset,by=c("year","region"))
merged<-data.frame(merged)
names(merged)[2]<-"region"

##################
## Figure 1
##################
pdf("mdihist.pdf")
par(mfrow=c(1,1))
hist(merged$mdi_std,breaks=50,prob=F,main="", xlab="Standardized MDI", col=rgb(0,0,1,1/4))
dev.off()

##################
## Figure 2
##################
pdf("mscorehist.pdf")
par(mfrow=c(1,1))
hist(merged$M,breaks=100,prob=F,main="",xlab="Standardized M-Score", col=rgb(0,0,1,1/4))
dev.off()

##################
## Figure 3
##################
pdf("regiononsethist.pdf")
par(mfrow=c(1,1))
hist(merged$regiononsetmean,breaks=40,prob=F,main="",xlab="Lagged Regional Onset", col=rgb(0,0,1,1/4))
dev.off()

##################
#### Table 1
##################
# Model 1 - warren
model.1 <- glm(onset ~ mdi_std + lgdpl + larea + lmtn + lpopl + oil2l + deml +
               deml2 + ethfracl + relfracl + pcyrs + spline1 + spline2 +
               spline3, data = merged, family = binomial(logit))
standard.errors.1 <- sqrt(diag(vcovCluster(
    model.1, merged$cowcode[-model.1$na.action]
    )))

# Model 2 - BKR
model.2 <- glm(onset ~  mdi_std + M + lgdpl +larea + lmtn + lpopl + oil2l + deml  +
               deml2 +ethfracl + relfracl + pcyrs + spline1 + spline2 +
               spline3, data = merged, family = binomial(logit))
standard.errors.5 <- sqrt(diag(vcovCluster(
    model.2, merged$cowcode[-model.2$na.action]
    )))

# Model 3 - BKR
model.3 <- glm(onset ~ mdi_std +M + mdi_std2 +  lgdpl + larea + lmtn + lpopl + oil2l + deml  +
               deml2 +ethfracl + relfracl + pcyrs + spline1 + spline2 +
               spline3, data = merged, family = binomial(logit))
standard.errors.6 <- sqrt(diag(vcovCluster(
    model.3, merged$cowcode[-model.3$na.action]
    )))

# Model 4 - BKR
model.4 <- glm(onset ~ mdi_std + M +  mdi_std2 + regiononsetmean+  lgdpl + larea + lmtn + lpopl + oil2l + deml  +
               deml2 +ethfracl + relfracl + pcyrs + spline1 + spline2 +
               spline3, data = merged, family = binomial(logit))
standard.errors.4 <- sqrt(diag(vcovCluster(
    model.4, merged$cowcode[-model.4$na.action]
    )))

#Create table
stargazer(model.1, model.2, model.3, model.4,
          se = list(
          standard.errors.1,
          standard.errors.5, standard.errors.6,
          standard.errors.4
          ),font.size="tiny",omit="region")

##################
## Figure 4
##################
## Get model and variance covariance matrix
model.4 <- glm(onset ~ M + mdi_std + mdi_std2 + regiononsetmean+  lgdpl + larea + lmtn + lpopl + oil2l + deml  +
               deml2 +ethfracl + relfracl + pcyrs + spline1 + spline2 +
               spline3, data = merged, family = binomial(logit))
vcovmat.4 <- vcovCluster(model.4, merged$cowcode[-model.4$na.action])

## Simulate the coefficients
set.seed(02138)
nsims <- 1000000
sims.4 <- rmvnorm(n = nsims, mean = model.4$coefficients, sigma = vcovmat.4)

## Get the means for the variables in the regression
vars <- names(model.4$model)[!(names(model.4$model) %in% "onset")]
means.max.4 <- colMeans(merged[, vars], na.rm = TRUE)
means.min.4 <- colMeans(merged[, vars], na.rm = TRUE)

means.max.4[4]<-max(merged$regiononsetmean)
means.min.4[4]<-min(merged$regiononsetmean)

means.max.4<-c(1, means.max.4)
means.min.4<-c(1, means.min.4)

pred.prob.max<-inv.logit(means.max.4 %*% t(sims.4))
pred.prob.min<-inv.logit(means.min.4 %*% t(sims.4))

draws.max <- rbinom(n = nsims, size = 1, prob = as.numeric(pred.prob.max))
draws.min <- rbinom(n = nsims, size = 1, prob = as.numeric(pred.prob.min))

mean(draws.max) - mean(draws.min)

#Create figure
pdf(file = "regionalonseteffectshist.pdf")
hist(pred.prob.max, breaks = 90, col=rgb(1,0,0,1/4),
     main = "", xlab = "Predicted probability")
hist(pred.prob.min,breaks=25,col=rgb(0,0,1,1/4),add=T)
segments(c(0.012, 0.033),
         c(60000, 50000),
         c(0.10, 0.1),
         c(65000, 60000),
         lty = 3)
text(0.10+0.05, 65000, paste("Min. lagged regional onset (mean=0.008569)"))
text(0.10+0.05, 60000, paste("Max. lagged regional onset (mean=0.032323)"))
dev.off()

##################
## Figure 6
##################
## Get the means for the variables in the regression
vars <- names(model.4$model)[!(names(model.4$model) %in% "onset")]
means.4 <- colMeans(merged[, vars], na.rm = TRUE)

pred.range.df<-data.frame(matrix(NA,length(c(seq(0,1,0.01))),3))

means.4 <- colMeans(merged[, vars], na.rm = TRUE)
means.4<-c(1, means.4)

for (i in 1:length(c(seq(0,1,0.01)))) {
    print(i/length(c(seq(0,1,0.01))))

    means.4[5]<-c(seq(0,1,0.01))[i]

    pred.prob<-inv.logit(means.4 %*% t(sims.4))

    pred.mean<-mean(pred.prob)
    pred.cis<-quantile(pred.prob,c(0.025,0.975))

    pred.range.df[i,] <- c(pred.mean,pred.cis)
}
pred.range.df <- cbind(c(seq(0,1,0.01)),pred.range.df)
names(pred.range.df)<-c("Regionalonset","Mean","Lci","Uci")

pdf("regionalonseteffectplot.pdf")
plot(pred.range.df$Regionalonset,pred.range.df$Mean,type="l",ylim=c(0,0.05),main="",ylab="Probability of onset",xlab="5-year lagged regional onset",lwd=2)
lines(pred.range.df$Regionalonset,pred.range.df$Lci,lty=3,lwd=2)
lines(pred.range.df$Regionalonset,pred.range.df$Uci,lty=3,lwd=2)
dev.off()

##################
## Figure 5
##################
## Remove missing data (necessary in order to run makeFrontier function)
datatomatch <- merged[complete.cases(merged), ]

## Identify variables to drop/keep when matching
drops <- c("year", "region", "cowcode", "country", "onset", "atwar",
           "mdi_p50", "newsli", "radioli", "tvli", "lgdpl_p50", "telephli",
           "litli", "seduli", "pfl", "deml2", "id", "irst", "milex", "milper",
           "energy", "upop", "cinc", "milper01", "milex01", "energy01",
           "irst01", "newsli01", "radioli01", "tvli01", "mdi_std", "mdi_std2",
           "mdi2", "yeartotal", "rollingtotal", "ncountries")
keeps <- factor(names(datatomatch)[!(names(datatomatch) %in% drops)])

## Make the frontier
myfrontier <- makeFrontier(
    treatment = "mdi_p50",
    dataset = datatomatch,
    drop = drops,
    QOI = "FSATT",
    metric = "Mahal",
    ratio = "variable")

## Get the frontier estimates
myEst <- frontierEst(myfrontier, datatomatch,
                     myform = formula("onset ~ mdi_p50"),
                     treatment = "mdi_p50")

## Combine plots
pdf("FrontierPlotMDI.pdf")
p1 <- plotFrontier(myfrontier, datatomatch, drop = drops) +
    theme_bw() +
    labs(title = "") +
    theme(text = element_text(family = "serif", vjust = 0.4))
p2 <- plotEffects(myfrontier, datatomatch, myEst, drop = drops) +
    theme_bw() +
    theme(text = element_text(family = "serif", vjust = 0.4))
p3 <- plotMeans(myfrontier, datatomatch, myEst, drop = drops) +
    theme_bw() +
    labs(fill = keeps, colour = "Variable") +
    theme(
        text = element_text(family = "serif", vjust = 0.4)
        )
p <- grid.arrange(arrangeGrob(p1, p2, ncol = 1), p3, ncol = 2)
print(p)
dev.off()
